
### Project: IADB Government Payroll Analytics - Country
### Project leader: Dr Christian Schuster
### Code author (s): Robert Lipiński
### Date last update: (run below or see 'exec_time.csv')
# last-modified time of the document returned by RStudio's active document context
file.mtime(rstudioapi::getActiveDocumentContext()$path)

### Script purpose: code turnover variables by recording the months of first and last observations for each civil servant and 
### annualizing them

### Execution time: ~10 minutes (see 'exec_time.csv')

### Inputs: 
# 1) /data/intermediate/country_07_limpiar_cubertura.[format1]


### Outputs:
# 1) /data/intermediate/country_09_limpiar_rotacion.[format1]



#
# SET-UP --------------------------------------------------------------------------------------------
#

### Source the '00_country_global.R' script (located in the same folder as this script) with required packages and functions
source(file.path(dirname(rstudioapi::getActiveDocumentContext()$path), '00_country_global.R'))


# library(installr)
# updateR()

# Make a copy of the file: same path with 'code' -> 'code/00_ARCHIVE' and a trailing '.R' -> ' - copy.R'
# (wrapped in local() so the helper variables do not stay in the global environment)
local({
  script_path  = rstudioapi::getSourceEditorContext()$path
  # '\\.R$' + sub(): only the file extension is rewritten, not any other '.R' that occurs inside the path
  archive_path = gsub('code', 'code/00_ARCHIVE', sub('\\.R$', ' - copy.R', script_path))

  # file.copy() returns FALSE without raising an error (e.g. when the archive folder is missing),
  # so make a failed copy visible
  if(!file.copy(script_path, archive_path, overwrite = TRUE, copy.date = TRUE)){
    warning('Script copy could not be saved to: ', archive_path, call. = FALSE)
  }
})


t0 = Sys.time() # record start time

#
# READ DATA -------------------------------------------------------------------------------------------------------------------
#




t0 = Sys.time() # record start time (overrides the value recorded during set-up)

# get columns used in this script: an input column is kept if its name matches (as a regular expression)
# any lower-cased line of this script. NOTE: comments are part of the script text, so mentioning a
# column name in a comment is enough for that column to be read.
col_names = names(open_dataset(file.path(main_dir, 'data', 'intermediate', "country_07_limpiar_cubertura.parquet")))

# read the script text once, rather than once per column name
script_lines = tolower(readLines(rstudioapi::getSourceEditorContext()$path))

# vapply() always returns a logical vector, also when there are no column names at all
col_select1  = col_names[vapply(col_names, function(c) any(grepl(c, script_lines)), logical(1))]
col_select1

country_rotacion = read_flex(file.path(main_dir, 'data', 'intermediate', "country_07_limpiar_cubertura"),
                           format = format1, col_select = col_select1)

## set as data.table if not already done (everything below uses data.table syntax)
# NOTE(review): this line used to call setensos(), which is not a data.table or base R function;
# it is presumably a mangled setDT() - confirm it is not a helper defined in the global script
if(!inherits(country_rotacion, 'data.table')){setDT(country_rotacion)}
gc()

beep()



# '  ----------------------------------------------------------------------------------------------------------------------------------------------------
#  MOBILITY VARIABLES  -------------------------------------------------------------------------------------------------------------------------------
#

#### re-run org_appear_df -----------------------------------------------------------------------------------------------------
# one row per organization-month combination present in the data
org_appear_df <- unique(country_rotacion[, .(organismo_nombre_clean, anyo_mes)])

# per organization: min_miss() / max_miss() of its months
# (project helpers, presumably NA-tolerant min / max, i.e. first and last month observed)
org_appear_df[, `:=`(
  org_appear_date    = min_miss(anyo_mes),
  org_disappear_date = max_miss(anyo_mes)
), by = organismo_nombre_clean]


### all dates (months) organization observed (following filtering above)
# sorted so that shift() below compares consecutive observed months of the same organization
setorder(org_appear_df, organismo_nombre_clean, anyo_mes)


# days since the previous / until the next month in which the same organization is observed
# (NA for the first / last month of each organization)
org_appear_df[, `:=`(
  anyo_mes_org_lag  = as.numeric(difftime(anyo_mes, shift(anyo_mes, n = 1L, type = "lag"), units = "days")),
  anyo_mes_org_lead = as.numeric(difftime(shift(anyo_mes, n = 1L, type = "lead"), anyo_mes, units = "days"))
), by = organismo_nombre_clean]

### first and last day org observed
# org_appear_agg = unique(org_appear_df[, .(organismo_nombre_clean, org_appear_date, org_disappear_date)])
# 

# remove these columns from the main data (the two org dates are re-attached by the join below),
# then add the organization-month columns to every row of the main data
country_rotacion[, c("cubertura", "org_appear_date", "org_disappear_date") := NULL]
country_rotacion <- org_appear_df[country_rotacion, on = c("organismo_nombre_clean", "anyo_mes")]


summary(country_rotacion[["anyo_mes_org_lead"]])





### id_appear / id_disappear ----------------------------------------------------------------------------------------------
# earliest and latest month of each person across all organizations
country_rotacion[, `:=`(
  id_mes_min = min(anyo_mes, na.rm = TRUE),
  id_mes_max = max(anyo_mes, na.rm = TRUE)
), by = .(person_id)]


# earliest and latest month of each person within each organization
country_rotacion[, `:=`(
  id_mes_min_org = min(anyo_mes, na.rm = TRUE),
  id_mes_max_org = max(anyo_mes, na.rm = TRUE)
), by = c("person_id", "organismo_nombre_clean")]



### appear [hire] -----------------------------------------------------------------------------------------------------------------------------------
# A row (person-month) is coded as a hire (1) if EITHER
#   (a) it is the person's first month AND that month is >= 28 days after the first month of the organization
#       (so that people present from the organization's first month of data are not counted as hires), OR
#   (b) the person re-appears after a gap longer than the gap threshold.
# Gap threshold = at least 1 year (364 days), raised row by row to the organization's own gap in the data
# (anyo_mes_org_lag for hires, anyo_mes_org_lead for leavers).
#
# NOTE(review): the threshold used to be max_miss(c(364, day_lag_org)). Inside j without 'by' that is ONE number
# for the whole table (assuming max_miss() is an NA-tolerant max), so 'day_lag_org > threshold' could never be
# TRUE and the other gap rules were almost never TRUE; the leaver rules also used the lag column.
# The organization gap columns were computed above but never used, so they are assumed to be the intended
# row-wise comparison - confirm.

country_rotacion[, id_appear := fifelse(test = (
  (!is.na(id_mes_min) & # if no-missing min day
     (id_mes_min - org_appear_date >= 28) &  # if first month at least ~1 month AFTER org first appears (which equals first month of data for most organizations)
     anyo_mes == id_mes_min) |  
    (!is.na(day_lag) & day_lag > pmax(364, anyo_mes_org_lag, na.rm = TRUE))   # OR if LAGGING day difference min. 1 year (and longer than the org's own gap)
    ),
    yes = 1,
    no  = 0)]

country_rotacion[, id_appear_org := fifelse(test = (
  (!is.na(id_mes_min_org) & # if no-missing min day
     (id_mes_min_org - org_appear_date >= 28) &  # if first month at least ~1 month AFTER org first appears (which equals first month of data for most organizations)
     anyo_mes == id_mes_min_org) | 
    (!is.na(day_lag_org) & day_lag_org > pmax(364, anyo_mes_org_lag, na.rm = TRUE))  # OR if LAGGING day difference min. 1 year (and longer than the org's own gap)
    ), 
    yes = 1,
    no  = 0)]

country_rotacion[, id_appear_org := fifelse(id_appear == 1, 1, id_appear_org)]  # OR if appearing in the public sector as a whole (e.g. day_lag_org 
# might be missing as person is new to the org, but there might be a general lag, even if >364 days a person might have been working somewhere else)

### checks
table(country_rotacion$id_appear, country_rotacion$id_appear_org, useNA='ifany')
# table(country_rotacion$id_appear, country_rotacion_save$id_appear, useNA='ifany')
# table(country_rotacion$id_appear_org, country_rotacion_save$id_appear_org, useNA='ifany')

country_rotacion[ , .N, by = .(id_appear, id_appear_org)]




### disappear [leaver] -----------------------------------------------------------------------------------------------------------------------------

# Mirror image of the hire rules: the person's last month, if it is > 31 days before the last month of the
# organization, OR a gap until the person's next observation that exceeds the gap threshold (LEAD columns)
country_rotacion[, id_disappear := fifelse(test = (
  (!is.na(id_mes_max) & # if no-missing max day
     (org_disappear_date - id_mes_max > 31) & # if max month >31 (1 month) before org disappear (which equals last month of data for most organizations)
     anyo_mes == id_mes_max) | 
    (!is.na(day_lead) & day_lead > pmax(364, anyo_mes_org_lead, na.rm = TRUE)) # OR if LEADING day difference min. 1 year (and longer than the org's own gap)
),  
yes = 1,
no  = 0)]


country_rotacion[, id_disappear_org := fifelse(test = (
  (!is.na(id_mes_max_org) & # if no-missing max day
     (org_disappear_date - id_mes_max_org > 31) & # if max month >31 (1 month) before org disappear (which equals last month of data for most organizations)
     anyo_mes == id_mes_max_org) | 
    (!is.na(day_lead_org) & day_lead_org > pmax(364, anyo_mes_org_lead, na.rm = TRUE)) # OR if LEADING day difference min. 1 year (and longer than the org's own gap)
),  
yes = 1,
no  = 0)]


country_rotacion[, id_disappear_org := fifelse(id_disappear == 1, 1, id_disappear_org)]  # OR if disappearing from the public sector as a whole (e.g. some people
# are not coded as organization leavers because their org disappears, but then they pop up a few years later in a different org - should only concern ~100 people)


### checks
table(country_rotacion$id_disappear, country_rotacion$id_disappear_org, useNA='ifany')
# table(country_rotacion$id_disappear, country_rotacion_save$id_disappear, useNA='ifany')
# table(country_rotacion$id_disappear_org, country_rotacion_save$id_disappear_org, useNA='ifany')

gc()




# municipalidad de curanilahue; municipalidad de iquique, asociacion metropolitana de municipalidades de santiago sur para la gestion ambiental y de residuos (msur)
# spot check for a single organization: number of unique person_id values in each month
org1 <- 'municipalidad de valdivia'
id1 <- country_rotacion[which(country_rotacion[["organismo_nombre_clean"]] == org1), ]
arrange(id1[, uniqueN(person_id), by = "anyo_mes"], anyo_mes)

# ancud - 21 - too large of a gap


# View(country_rotacion[country_rotacion$person_id == id1$person_id[2], ])



### annual values -------------------------------------------------------------------------------------------------------------------------------
gc()

# id1 = readRDS(file.path('data', 'id_check.rds')) # problematic IDs (only for checking)
# temp3 = country_rotacion[country$person_id %in% id2$person_id[21], ]

# Annualize the monthly 0/1 flags: within each group (person-year) all rows get the same label,
# which is the 'nuevo ...' / 'egrasado' label if the flag equals 1 in ANY month of the group
country_rotacion[, `:=`(
  hire_anyo       = fifelse(any(id_appear == 1), 'nuevo contratado', 'personal existente'),
  leaver_anyo     = fifelse(any(id_disappear == 1),  'egrasado', 'personal existente'),
  # any() added: without it this label was evaluated row by row (per month) instead of once per group,
  # unlike the two labels above
  hire_leaver_anyo = fifelse(any(id_appear + id_disappear > 0), 'nuevo o egrasado', 'personal existente')
),
by = .(anyo, person_id)]

# same labels, but per person-year WITHIN each organization
country_rotacion[, `:=`(
  hire_anyo_org   = fifelse(any(id_appear_org == 1),  'nuevo contratado', 'personal existente'),
  leaver_anyo_org = fifelse(any(id_disappear_org == 1), 'egrasado', 'personal existente'),
  # any() added for the same reason as above
  hire_leaver_anyo_org = fifelse(any(id_appear_org + id_disappear_org > 0), 'nuevo o egrasado', 'personal existente')
), 
by = .(anyo, organismo_nombre_clean, person_id)]





### checks > any missing values?
# country_rotacion[, pr_na(leaver_anyo != 'personal existente'), by = .(anyo)]
# country_rotacion[, pr_na(leaver_anyo_org != 'personal existente'), by = .(anyo)]
# 
# country_rotacion[, uniqueN(person_id[leaver_anyo != 'personal existente'])/uniqueN(person_id), by  = .(dataset, anyo)]
# 

# '*_org_only' labels (one value per person-year): set to the hire / leaver label when at least one row of
# the group carries that label at the organization level but not at the whole-data level
country_rotacion[, c("hire_anyo_org_only", "leaver_anyo_org_only", "hire_leaver_anyo_org_only") := .(
  fifelse(any(hire_anyo_org == 'nuevo contratado' & hire_anyo != 'nuevo contratado'),
          'nuevo contratado', 'personal existente'),
  fifelse(any(leaver_anyo_org == 'egrasado' & leaver_anyo != 'egrasado'),
          'egrasado', 'personal existente'),
  fifelse(any(hire_leaver_anyo_org == 'nuevo o egrasado' & hire_leaver_anyo != 'nuevo o egrasado'),
          'nuevo o egrasado', 'personal existente')
), by = c("anyo", "person_id")]



## checks
# anyone flagged at the whole-data level should also be flagged at the organization level (but not the other
# way around), so the corresponding off-diagonal entry of the tables below should be 0
### a small % of entries not matching the above - MANUALLY OVERRIDE? (for now)

# copy the whole-data label onto the organization-level label wherever the former is a hire / leaver
country_rotacion[, hire_anyo_org := fifelse(hire_anyo == 'nuevo contratado',  'nuevo contratado', hire_anyo_org)]
country_rotacion[, leaver_anyo_org := fifelse(leaver_anyo == 'egrasado', 'egrasado', leaver_anyo_org)]
# the replacement label was misspelled with a double 'o', which created an extra category that did not
# match the 'nuevo o egrasado' label used everywhere else
country_rotacion[, hire_leaver_anyo_org := fifelse(hire_leaver_anyo == 'nuevo o egrasado',  'nuevo o egrasado', hire_leaver_anyo_org)]


table(country_rotacion$id_appear, country_rotacion$id_appear_org, useNA = 'ifany')
table(country_rotacion$hire_anyo, country_rotacion$hire_anyo_org, useNA = 'ifany')

table(country_rotacion$leaver_anyo, country_rotacion$leaver_anyo_org, useNA = 'ifany')
# combined label (this slot used to repeat the hire table)
table(country_rotacion$hire_leaver_anyo, country_rotacion$hire_leaver_anyo_org, useNA = 'ifany')


     
# need the last two to do month-level calculation

# country_rotacion %>% filter(leaver_anyo == 'egrasado' & leaver_anyo_org != 'egrasado') %>% View
# country_rotacion %>% filter(person_id == 'manuel eduardo reyes arias') %>% View

# sf() is a project helper; called here on the 0/1 flags before they are turned into text labels
sf(country_rotacion$id_appear)
sf(country_rotacion$id_disappear)

# turn the monthly 0/1 flags into the same text labels as the annual variables
# hire flags: 1 -> 'nuevo contratado'
country_rotacion[, c("id_appear", "id_appear_org") := lapply(.SD, function(flag) {
  fifelse(flag == 1, 'nuevo contratado', 'personal existente')
}), .SDcols = c("id_appear", "id_appear_org")]

# leaver flags: 1 -> 'egrasado'
country_rotacion[, c("id_disappear", "id_disappear_org") := lapply(.SD, function(flag) {
  fifelse(flag == 1, 'egrasado', 'personal existente')
}), .SDcols = c("id_disappear", "id_disappear_org")]








### merge with full data -----------------------------------------------------------------------------------------------------------------------------------------------
# keep only the row identifier, the annual variables and the four monthly labels
country_rotacion <- select(country_rotacion,
                           row_id_org, matches('_anyo'),
                           id_appear, id_disappear, id_appear_org, id_disappear_org)

### > save ----------------------------------------------------------------------------------------------------------------------------

gc()
write_flex(country_rotacion,
           file.path(main_dir, 'data', 'intermediate', "country_09_limpiar_rotacion"),
           format = format1)

beep('complete')

# project helper, called with the name 'exec_time' (see 'exec_time.csv' in the header)
exec_time_fun('exec_time')





#
# FIN DEL CÓDIGO  --------------------------------------------------------------------------------------------
# 